Script to combine gene-by-sample expression matrices from different datasets. It requires manually prepared dataset file with four columns:
1. *Dataset_name* - user defined names of datasets
2. *Expression_matrix* - corresponding expression/count matrices
3. *Target_file* - defining the sample names for merged expression matrix
4. *Outliers_file* - listing outlier samples to be removed before combining the data.
Note, only genes intersection across all datasets expression matrices will be reported in the combined expression matrix. The pipeline is based on recommendaitons from RNAseq123 package.
##### Create 'not in' operator
"%!in%" <- function(x,table) match(x,table, nomatch = 0) == 0
##### Prepare object to write into a file
prepare2write <- function (x) {
x2write <- cbind(rownames(x), x)
colnames(x2write) <- c("Gene",colnames(x))
return(x2write)
}
##### Prepare gene data matrix to write into a file
geneMatrix2write <- function (x) {
x2write <- cbind(rownames(x), x)
colnames(x2write) <- c("Gene",colnames(x))
return(x2write)
}
##### Assign colours to different groups
getTargetsColours <- function(targets) {
##### Predefined selection of colours for groups
targets.colours <- c("red","blue","green","darkgoldenrod","darkred","deepskyblue", "coral", "cornflowerblue", "chartreuse4", "bisque4", "chocolate3", "cadetblue3", "darkslategrey", "lightgoldenrod4", "mediumpurple4", "orangered3","indianred1","blueviolet","darkolivegreen4","darkgoldenrod4","firebrick3","deepskyblue4", "coral3", "dodgerblue1", "chartreuse3", "bisque3", "chocolate4", "cadetblue", "darkslategray4", "lightgoldenrod3", "mediumpurple3", "orangered1")
f.targets <- factor(targets)
vec.targets <- targets.colours[1:length(levels(f.targets))]
targets.colour <- rep(0,length(f.targets))
for(i in 1:length(f.targets))
targets.colour[i] <- vec.targets[ f.targets[i]==levels(f.targets)]
return( list(vec.targets, targets.colour) )
}
##### Assign colours to different datasets
getDatasetsColours <- function(datasets) {
##### Predefined selection of colours for datasets
datasets.colours <- c("dodgerblue","firebrick","lightslategrey","darkseagreen","orange","darkcyan","bisque", "coral2", "cadetblue3","red","blue","green")
f.datasets <- factor(datasets)
vec.datasets <- datasets.colours[1:length(levels(f.datasets))]
datasets.colour <- rep(0,length(f.datasets))
for(i in 1:length(f.datasets))
datasets.colour[i] <- vec.datasets[ f.datasets[i]==levels(f.datasets)]
return( list(vec.datasets, datasets.colour) )
}
##### Assign symbols to different datasets (for scatterplots)
getDatasetsSymbols <- function(datasets) {
##### Predefined selection of symbols for datasets
datasets.symbols <- c("circle","square","diamond","cross","circle-open","square-open","diamond-open")
f.datasets <- factor(datasets)
vec.datasets <- datasets.symbols[1:length(levels(f.datasets))]
datasets.symbols <- rep(0,length(f.datasets))
for(i in 1:length(f.datasets))
datasets.symbols[i] <- vec.datasets[ f.datasets[i]==levels(f.datasets)]
return( list(vec.datasets, datasets.symbols) )
}
suppressMessages(library(preprocessCore))
suppressMessages(library(rapportools))
suppressMessages(library(sva))
suppressMessages(library(RUVSeq))
## Warning: package 'GenomicRanges' was built under R version 3.5.1
## Warning: package 'DelayedArray' was built under R version 3.5.1
suppressMessages(library(sneer))
suppressMessages(library(FactoMineR))
suppressMessages(library(plotly))
suppressMessages(library(edgeR))
suppressMessages(library(corrplot))
suppressMessages(library(RSkittleBrewer))
trop <- RSkittleBrewer("tropical")
Read in the dataset file containing inforamtion about the read count matrices to be combined along with the corresponding target and outliers files. Then merge the read count matrices based on the information in the dataset file Note that only genes intersection across all datasets expression matrices will be reported in the combined expression matrix. Genes not present across all datasets will be listed in [datasets].missing_genes.txt file.
##### Read file with datasets information
DatasetInput=read.table(paste(params$projectDir,params$datasetsFile,sep="/"),sep="\t", as.is=TRUE, header=TRUE, row.names=1)
##### Extract info about target file for the first dataset
fileInfo = strsplit(DatasetInput[,"Target_file"], split='/', fixed=TRUE)
targetFile <- read.table(DatasetInput[1,"Target_file"], sep="\t", as.is=TRUE, header=TRUE)[,c(1:4)]
##### Make sure that there are no duplciated samples in the target file
targetFile <- targetFile[!duplicated(targetFile[,"Sample_name"]),]
rownames(targetFile) <- targetFile[,"Sample_name"]
targetFile <- cbind(targetFile[,2:4],rownames(DatasetInput[1,]))
colnames(targetFile)[ncol(targetFile)] <- "Dataset"
if ( nrow(DatasetInput) > 1 ) {
for ( i in 2:nrow(DatasetInput) ) {
##### Create a temporary object to store info from the remaining target files
targetFileTmp <- read.table(DatasetInput[i,"Target_file"], sep="\t", as.is=TRUE, header=TRUE)[,c(1:4)]
##### Make sure that there are no duplciated samples in the target file
targetFileTmp <- targetFileTmp[!duplicated(targetFileTmp[,"Sample_name"]),]
rownames(targetFileTmp) <- targetFileTmp[,"Sample_name"]
targetFileTmp <- cbind(targetFileTmp[,2:4],rownames(DatasetInput[i,]))
colnames(targetFileTmp)[ncol(targetFileTmp)] <- "Dataset"
##### Deal with replicates
if ( any(!is.na(targetFile[,"Replicates"])) ) {
maxRep <- max(targetFile[!is.na(targetFile[,"Replicates"]),"Replicates"])
targetFileTmp[,"Replicates"] <- targetFileTmp[,"Replicates"] + maxRep
}
targetFile <- rbind(targetFile, targetFileTmp)
}
}
##### Make syntactically valid names
rownames(targetFile) <- make.names(rownames(targetFile))
##### Loop through the gene-by-sample expression matrices from different datasets and merge them into a matrix
for (data_matrix in DatasetInput[ , "Expression_matrix" ] ) {
##### Create combined dataset variable if it doesn't exist yet
if (!exists("datasets")) {
datasets <- as.data.frame( read.table(data_matrix, header=TRUE, sep="\t", row.names=NULL) )
##### list genes present in individal files
gene_list <- as.vector(datasets[,1])
##### Add data for the remaining samples
} else if (exists("datasets")) {
dataset <-as.data.frame( read.table(data_matrix, header=TRUE, sep="\t", row.names=NULL) )
##### list genes present in individal files
gene_list <- c( gene_list, as.vector(dataset[,1]) )
##### Merge the expression data and make sure that the genes order is the same
datasets <- merge( datasets, dataset, by=1, all = FALSE, sort= TRUE)
##### Remove per-sample data for merged samples to free some memory
rm(dataset)
}
}
##### Use gene IDs as rownames
rownames(datasets) <- datasets[,1]
datasets <- datasets[, -1]
##### Make syntactically valid names
colnames(datasets) <- make.names(colnames(datasets))
##### Make sure that the target file contains info only about samples present in the data matrix
targetFile <- targetFile[ rownames(targetFile) %in% colnames(datasets), ]
##### Make sure that the samples order in the data matrix is the same as in the target file
datasets <- datasets[ , rownames(targetFile) ]
##### Save datasets names
datasetIDs <- rownames(DatasetInput)
##### Identify genes that were not present across all per-sampel files and were ommited in the merged matrix
gene_list <- unique(gene_list)
gene_list.missing <- gene_list[ gene_list %!in% rownames(datasets) ]
##### Write list of missing genes into a file
if ( length(gene_list.missing) > 0 ) {
write.table(prepare2write(gene_list.missing), file = paste0(params$projectDir, "/", paste(datasetIDs, collapse="_"),".missing_genes.txt"), sep="\t", quote=FALSE, row.names=TRUE, append = FALSE )
}
Remove user-defined outliers from individual datasets listed in outliers files.
outlierList <- NULL
j<-1
##### Read outliers file for each dataset and list them
for ( i in 1:length(datasetIDs) ) {
##### Check if the outliers file is reported. If not, assume that there are not outliers in the corresponding dataset
if ( !is.empty(DatasetInput[ i, "Outliers_file"]) ) {
##### Check if the outliers file is not empty
FileInfo = file.info(DatasetInput[ i, "Outliers_file"])
##### If non-empty then read it in
if ( FileInfo$size != 0 ) {
outliersInfo = read.table(DatasetInput[ i, "Outliers_file"], sep="\t", as.is=TRUE, header=FALSE, row.names=NULL)
outliers=as.vector(unlist(outliersInfo))
for (k in 1:length(outliers)) {
cat(paste("Removing sample:", outliers[k], "\n", sep=" "))
outlierList[j] <- outliers[k]
j<-j+1
}
}
}
}
##### Make syntactically valid names
outlierList <- make.names(outlierList)
##### Remove outliers from the combined expression matrix and target file
datasets <- datasets[, setdiff(colnames(datasets), outlierList) ]
targetFile <- targetFile[ setdiff(rownames(targetFile) , outlierList) ,]
Bar-plot illustrating library size for each sample (bar). The colours indicate datasets.
suppressMessages(library(plotly))
##### Assigne colours to targets and datasets
targets.colour <- getTargetsColours(targetFile$Target)
datasets.colour <- getDatasetsColours(targetFile$Dataset)
##### Assign symbopls according to defined datasets
datasets.symbols <- getDatasetsSymbols(targetFile$Dataset)
##### Generate bar-plot for library size
##### Prepare data frame
datasets.df <- data.frame(targetFile$Dataset, colnames(datasets), as.numeric(colSums(datasets)*1e-6))
colnames(datasets.df) <- c("Dataset","Sample", "Library_size")
##### The default order will be alphabetized unless specified as below
datasets.df$Sample <- factor(datasets.df$Sample, levels = datasets.df[["Sample"]])
p <- plot_ly(datasets.df, x = ~Sample, y = ~Library_size, color = ~Dataset, type = 'bar', colors = datasets.colour[[1]], width = 1000, height = 400) %>%
layout(title = "", xaxis = list( tickfont = list(size = 10), title = ""), yaxis = list(title = "Library size (millions)"), margin = list(l=50, r=50, b=150, t=50, pad=4), autosize = F, legend = list(orientation = 'v', y = 0.5), showlegend=TRUE)
##### Print htmlwidget
p
##### Save the bar-plot as html
htmlwidgets::saveWidget(as_widget(p), paste0(normalizePath(params$projectDir), "/", paste(datasetIDs, collapse="_"),"_RNAseq_libSize_datasets.html"), selfcontained = TRUE)
Bar-plot illustrating library size for each sample (bar). The colours indicate sample groups, as provided in Target column in the target files.
##### Generate bar-plot for library size
##### Prepare data frame
datasets.df <- data.frame(targetFile$Target, colnames(datasets), as.numeric(colSums(datasets)*1e-6))
colnames(datasets.df) <- c("Group","Sample", "Library_size")
##### The default order will be alphabetized unless specified as below
datasets.df$Sample <- factor(datasets.df$Sample, levels = datasets.df[["Sample"]])
p <- plot_ly(datasets.df, x = ~Sample, y = ~Library_size, color = ~Group, type = 'bar', colors = targets.colour[[1]], width = 1000, height = 400) %>%
layout(title = "", xaxis = list( tickfont = list(size = 10), title = ""), yaxis = list(title = "Library size (millions)"), margin = list(l=50, r=50, b=150, t=50, pad=4), autosize = F, legend = list(orientation = 'v', y = 0.5), showlegend=TRUE)
##### Print htmlwidget
p
##### Save the bar-plot as html (PLOTLY)
htmlwidgets::saveWidget(as_widget(p), paste0(normalizePath(params$projectDir), "/", paste(datasetIDs, collapse="_"),"_RNAseq_libSize_targets.html"), selfcontained = TRUE)
##### Detach plotly package. Otherwise it clashes with other graphics devices
detach("package:plotly", unload=FALSE)
For differential expression and related analyses, gene expression is rarely considered at the level of raw counts since libraries sequenced at a greater depth will result in higher counts. Rather, it is common practice to transform raw counts onto a scale that accounts for such library size differences. Here we convert the read count data into log2-counts per million (log-CPM) using function from edgeR package. Genes with very low counts across all libraries provide little evidence for differential expression. In the biological point of view, a gene must be expressed at some minimal level before it is likely to be translated into a protein or to be biologically important. In addition, the pronounced discretenes of these counts interferes with some of the statistical approximations that are used later in the pipeline. These genes should be filtered out prior to further analysis.
##### Create EdgeR DGEList object
y <- DGEList(counts=datasets, group=targetFile$Target)
##### Add datasets name for each sample
y$samples$dataset <- targetFile$Dataset
##### Filtering to remove low expressed genes. Users should filter with CPM rather than filtering on the counts directly, as the latter does not account for differences in library sizes between samples. Here we keep only genes that have CPM of 1
cat("The CPM of 1 (cut-off for removing low expressed genes) corresponds to", round(min(as.numeric(colSums(datasets)*1e-6)), digits=0), "reads in sample with the lowest sequencing depth, and", round(max(as.numeric(colSums(datasets)*1e-6)), digits=0), "reads in sample with the greatest sequencing depth\n")
The CPM of 1 (cut-off for removing low expressed genes) corresponds to 19 reads in sample with the lowest sequencing depth, and 88 reads in sample with the greatest sequencing depth
keep <- rowSums(cpm(y)>1) >= ncol(datasets)/10
y.filtered <- y[keep, , keep.lib.sizes=FALSE]
cat(nrow(y.filtered$counts), "genes remained after filtering out of the", nrow(datasets), "genes in the input read count matrix\n\n")
18836 genes remained after filtering out of the 56830 genes in the input read count matrix
##### Transformations from the raw-scale to CPM. Add small offset to each observation to avoid taking log of zero
y.cpm <- cpm(y, normalized.lib.sizes=TRUE, log=TRUE, prior.count=0.25)
y.filtered.cpm <- cpm(y.filtered, normalized.lib.sizes=TRUE, log=TRUE, prior.count=0.25)
Show the data distribution before and after low expressed genes filtering.
par(mfrow=c(1,2))
##### Before filtering
plot(density(y.cpm[,1]), lwd=2, ylim=c(0,0.25), las=2, main="", xlab="", col=datasets.colour[[2]][1])
title(main="Raw data", xlab="Log-cpm")
abline(v=0, lty=3)
for (i in 2:ncol(y.cpm)){
den <- density(y.cpm[,i])
lines(den$x, den$y, lwd=2, col=datasets.colour[[2]][i])
}
legend("topright", levels(targetFile$Dataset), fill=datasets.colour[[1]], bty="n")
##### After filtering
plot(density(y.filtered.cpm[,1]), lwd=2, ylim=c(0,0.25), las=2, main="", xlab="", col=datasets.colour[[2]][1])
title(main="Filtered data", xlab="Log-cpm")
abline(v=0, lty=3)
for (i in 2:ncol(y.filtered.cpm)){
den <- density(y.filtered.cpm[,i])
lines(den$x, den$y, lwd=2, col=datasets.colour[[2]][i])
}
legend("topright", levels(targetFile$Dataset), fill=datasets.colour[[1]], bty="n")
invisible(dev.off())
##### Save the plot as pdf file
pdf(paste0(normalizePath(params$projectDir), "/", paste(datasetIDs, collapse="_"),"_filtering.pdf"), width=8, height=5)
par(mfrow=c(1,2))
##### Before filtering
plot(density(y.cpm[,1]), lwd=2, ylim=c(0,0.25), las=2, main="", xlab="", col=datasets.colour[[2]][1])
title(main="Raw data", xlab="Log-cpm")
abline(v=0, lty=3)
for (i in 2:ncol(y.cpm)){
den <- density(y.cpm[,i])
lines(den$x, den$y, lwd=2, col=datasets.colour[[2]][i])
}
legend("topright", levels(targetFile$Dataset), fill=datasets.colour[[1]], bty="n")
##### After filtering
plot(density(y.filtered.cpm[,1]), lwd=2, ylim=c(0,0.25), las=2, main="", xlab="", col=datasets.colour[[2]][1])
title(main="Filtered data", xlab="Log-cpm")
abline(v=0, lty=3)
for (i in 2:ncol(y.filtered.cpm)){
den <- density(y.filtered.cpm[,i])
lines(den$x, den$y, lwd=2, col=datasets.colour[[2]][i])
}
legend("topright", levels(targetFile$Dataset), fill=datasets.colour[[1]], bty="n")
invisible(dev.off())
During the sample preparation or sequencing process, external factors that are not of biological interest can affect the expression of individual samples. For example, samples processed in the first batch of an experiment can have higher expression overall when compared to samples processed in a second batch. It is assumed that all samples should have a similar range and distribution of expression values. Normalisation for sample-specific effectss is required to ensure that the expression distributions of each sample are similar across the entire experiment. Normalisation by the method of trimmed mean of M-values (TMM) is performed using the calcNormFactors function in edgeR. The normalisation factors calculated here are used as a scaling factor for the library sizes. TMM is the recommended for most RNA-Seq data where the majority (more than half) of the genes are believed not differentially expressed between any pair of the samples.
Box-plots of log-cpm for individual samples, coloured by dataset, before and after TMM normalisation.
##### Adjust for RNA composition effect. Calculate scaling factors for the library sizes with calcNormFactors function using trimmed mean of M-values (TMM) between each pair of samples. Note, that the raw read counts are used to calculate the normalisation factors
y.norm <- calcNormFactors(y.filtered, method = "TMM")
##### Transformations from the raw-scale to CPM
y.norm.cpm <- cpm(y.norm, normalized.lib.sizes=TRUE, log=TRUE, prior.count=0.25)
##### Plot expression distribution of samples for unnormalised and normalised data
par(mfrow=c(2,1), mar=c(2, 5, 3, 2))
##### Unnormalised cpm data
boxplot(y.filtered.cpm, las=2, col=datasets.colour[[2]], main="", pch=".", las=3, xaxt="n")
title(main="Unnormalised data", ylab="Log-cpm")
legend("topright", levels(targetFile$Dataset), fill=datasets.colour[[1]], horiz=TRUE, bg="white", box.col="transparent")
##### Normalised cpm data
boxplot(y.norm.cpm, las=2, col=datasets.colour[[2]], main="", pch=".", las=3, xaxt="n")
title(main="Normalised data (TMM)", ylab="Log-cpm")
legend("topright", levels(targetFile$Dataset), fill=datasets.colour[[1]], horiz=TRUE, bg="white", box.col="transparent")
invisible(dev.off())
##### Save the plot as pdf file
pdf(paste0(normalizePath(params$projectDir), "/", paste(datasetIDs, collapse="_"),"_normalisation.pdf"), width=8, height=8)
par(mfrow=c(2,1), mar=c(2, 5, 3, 2))
##### Unnormalised cpm data
boxplot(y.filtered.cpm, las=2, col=datasets.colour[[2]], main="", pch=".", las=3, xaxt="n")
title(main="Unnormalised data", ylab="Log-cpm")
legend("topright", levels(targetFile$Dataset), fill=datasets.colour[[1]], horiz=TRUE, bg="white", box.col="transparent")
##### Normalised cpm data
boxplot(y.norm.cpm, las=2, col=datasets.colour[[2]], main="", pch=".", las=3, xaxt="n")
title(main="Normalised data (TMM)", ylab="Log-cpm")
legend("topright", levels(targetFile$Dataset), fill=datasets.colour[[1]], horiz=TRUE, bg="white", box.col="transparent")
invisible(dev.off())
# #######################################################
# ##### Prepare expression data frame for plotting in plotly by adding samples annotation and converting the matrix into two column data frame
# ##### Un-normalised cpm data
# y.filtered.cpm.df <- as.data.frame(cbind(rep(rownames(y.filtered.cpm), ncol(y.filtered.cpm)), rep(colnames(y.filtered.cpm), each=nrow(y.filtered.cpm)), rep(as.vector(y.filtered$samples$group), each=nrow(y.filtered.cpm)), as.numeric(y.filtered.cpm)))
# colnames(y.filtered.cpm.df) <- c("Gene", "Sample", "Group", "cpm")
#
# ##### Normalised cpm data
# y.norm.cpm.df <- as.data.frame(cbind(rep(rownames(y.norm.cpm), ncol(y.norm.cpm)), rep(colnames(y.norm.cpm), each=nrow(y.norm.cpm)), rep(as.vector(y.norm$samples$group), each=nrow(y.norm.cpm)), as.numeric(y.norm.cpm)))
# colnames(y.norm.cpm.df) <- c("Gene", "Sample", "Group", "cpm")
#
#
# p <- plot_ly(width = 800, height = 600)
#
# for ( i in 1:12 ) {
# p <- add_trace(p, y = y.filtered.cpm[,i], type = 'box', name = colnames(y.filtered.cpm)[i], jitter = 0.3, pointpos = 0, boxpoints = 'outliers',
# marker = list(color = 'rgb(9,56,125)'),
# line = list(color = 'rgb(9,56,125)'),
# showlegend=FALSE)
# }
#
# ##### Save the box-plot as html
# htmlwidgets::saveWidget(as_widget(p), paste0(paste(datasetIDs, collapse="_"),"_normalisation.html"), selfcontained = TRUE)
# #######################################################
The performance of the TMM normalisation procedure can be examined using mean-difference (MD) plots. This visualizes the library size-adjusted log-fold change between two libraries (the difference) against the average log-expression across those libraries (the mean). Each MD plot is generated by comparing one sample against an artificial library constructed from the average of all other samples. Ideally, the bulk of genes should be centred at a log-fold change of zero. This indicates that any composition bias between libraries has been successfully removed. The MD plots for all samples are saved in the [datasets] MD_plots directory.
##### Generate MD plot for the random 2 samples in each dataset (or 1 if 2 not available)
par(mfrow=c(length(datasetIDs),2), mar=c(2, 5, 3, 2))
for (dataset in datasetIDs ) {
##### Get sample names for each dataset
samples <- rownames(targetFile[ targetFile$Dataset == dataset, ])
##### Select random 2 samples (or 1 if 2 not available)
if ( length(samples) >= 2 ) {
samples <- samples[ sample(1:length(samples), 2, replace=FALSE) ]
} else {
samples <- samples[ sample(1:length(samples), length(samples), replace=FALSE) ]
}
for ( sample in samples ) {
plotMD(y.filtered.cpm, column=match(sample, colnames(y.filtered.cpm)))
abline(h=0, col="red", lty=2, lwd=2)
}
}
invisible(dev.off())
##### Create directory for pdf files
MDplotsDir <- paste0(normalizePath(params$projectDir), "/", paste(datasetIDs, collapse="_"), "_MD_plots")
if ( !file.exists(MDplotsDir) ){
dir.create(MDplotsDir, recursive=TRUE)
}
##### Save the plot as pdf file for each sample
for ( sample in colnames(y.filtered.cpm) ) {
pdf(paste0(MDplotsDir, "/", sample, "_MDplot.pdf"), width=6, height=5)
plotMD(y.filtered.cpm, column=match(sample, colnames(y.filtered.cpm)), xlim=c(-10,15), ylim=c(-10,10))
abline(h=0, col="red", lty=2, lwd=2)
invisible(dev.off())
}
Library sizes of individaul samples plotted against corresponsing scaling factors. The colours indicate datasets.
suppressMessages(library(plotly))
##### Plot the library sizes for individaul samples against corresponsing scaling factors
p <- plot_ly(y.norm$samples[, c("norm.factors", "lib.size", "dataset")], x = ~norm.factors, y = ~lib.size, text= ~rownames(y$samples), type='scatter', color = ~factor(dataset), colors = datasets.colour[[1]], mode = "markers", marker = list(size=6, symbol="circle"), width = 600, height = 400) %>%
layout(title = "", xaxis = list(title = "Scaling factor"), yaxis = list(title = "Library size"), margin = list(l=50, r=50, b=50, t=20, pad=4), autosize = F, showlegend = TRUE)
##### Print htmlwidget
p
##### Save the scatter-plot as html
htmlwidgets::saveWidget(as_widget(p), paste0(normalizePath(params$projectDir), "/", paste(datasetIDs, collapse="_"),"_scaling_factor_datasets.html"), selfcontained = TRUE)
Library sizes of individaul samples plotted against corresponsing scaling factors. The colours indicate sample groups (e.g. biological groups), as provided in Target column in the target files.
##### Plot the library sizes for individaul samples against corresponsing scaling factors
p <- plot_ly(y.norm$samples[, c("norm.factors", "lib.size", "group")], x = ~norm.factors, y = ~lib.size, text= ~rownames(y$samples), type='scatter', color = ~factor(group), colors = targets.colour[[1]], mode = "markers", marker = list(size=6, symbol="circle"), width = 600, height = 400) %>%
layout(title = "", xaxis = list(title = "Scaling factor"), yaxis = list(title = "Library size"), margin = list(l=50, r=50, b=50, t=20, pad=4), autosize = F, showlegend = TRUE)
##### Print htmlwidget
p
##### Save the scatter-plot as html
htmlwidgets::saveWidget(as_widget(p), paste0(normalizePath(params$projectDir), "/", paste(datasetIDs, collapse="_"),"_scaling_factor_targets.html"), selfcontained = TRUE)
##### Detach plotly package. Otherwise it clashes with other graphics devices
detach("package:plotly", unload=FALSE)
Before principal component analysis, we perform some exploratory data analysis such as PCA, relative log expression (RLE) plot, correlation matrix and scatter plot matrix. This procedure follows this tutorial and implements fucntions from the inSilicoMerging package.
Principal component analysis (PCA) reduces the dimensionality of data while retaining most of the variation in the dataset, making it possible to visually assess similarities and differences between samples and determine whether samples can be grouped. This exploratory analysis facilitates identification of the key factors affecting the variability in the expression data. Different colours represent datasets.
suppressMessages(library(plotly))
##### Keep only genes with variance > 0 across all samples
rsd <- apply(y.norm.cpm,1,sd)
y.norm.cpm.subset <- y.norm.cpm[rsd>0,]
##### Perform PCA
y.norm.cpm.subset_pca <- prcomp(t(y.norm.cpm.subset), scale=FALSE)
##### Get variance importance for all principal components
importance_pca <- summary(y.norm.cpm.subset_pca)$importance[2,]
importance_pca <- paste(round(100*importance_pca, 2), "%", sep="")
names(importance_pca) <- names(summary(y.norm.cpm.subset_pca)$importance[2,])
##### Prepare data frame
y.norm.cpm.subset_pca.df <- data.frame(targetFile$Target,targetFile$Dataset, y.norm.cpm.subset_pca$x[,"PC1"], y.norm.cpm.subset_pca$x[,"PC2"], y.norm.cpm.subset_pca$x[,"PC3"])
colnames(y.norm.cpm.subset_pca.df) <- c("Target", "Dataset", "PC1", "PC2", "PC3")
##### Generate PCA 2-D plot
p <- plot_ly(y.norm.cpm.subset_pca.df, x = ~PC1, y = ~PC2, color = ~Dataset, text=paste(targetFile$Dataset, rownames(y.norm.cpm.subset_pca.df), sep=": "), colors = datasets.colour[[1]], type='scatter', mode = "markers", marker = list(size=10, opacity = 0.7, symbol=datasets.symbols[[2]][order(targetFile$Dataset)]), width = 600, height = 400) %>%
layout(title = "", xaxis = list(title = paste( "PC1", " (",importance_pca["PC1"],")",sep="")), yaxis = list(title = paste( "PC2", " (",importance_pca["PC2"],")",sep="")), margin = list(l=50, r=50, b=50, t=20, pad=4), autosize = FALSE, showlegend = TRUE, legend = list(orientation = "v", y = 0.9))
p
##### Save PCA plot as html
htmlwidgets::saveWidget(as_widget(p), paste0(normalizePath(params$projectDir), "/", paste(datasetIDs, collapse="_"),"_PCA_2d_datasets.html"), selfcontained = TRUE)
##### Generate PCA 3-D plot
p <- plot_ly(y.norm.cpm.subset_pca.df, x = ~PC1, y = ~PC2, z = ~PC3, color = ~Dataset, text=paste(targetFile$Dataset, rownames(y.norm.cpm.subset_pca.df), sep=": "), colors = datasets.colour[[1]], type='scatter3d', mode = "markers", marker = list(size=8, opacity = 0.7, symbol=datasets.symbols[[2]][order(targetFile$Dataset)]), width = 600, height = 600) %>%
layout(title = "", xaxis = list(title = paste( "PC1", " (",importance_pca["PC1"],")",sep="")), yaxis = list(title = paste( "PC2", " (",importance_pca["PC2"],")",sep="")), zaxis = list(title = paste( "PC3", " (",importance_pca["PC3"],")",sep="")) , margin = list(l=50, r=50, b=50, t=10, pad=4), autosize = FALSE, showlegend = TRUE, legend = list(orientation = "v", y = 0.9))
p
##### Save PCA plot as html
htmlwidgets::saveWidget(as_widget(p), paste0(normalizePath(params$projectDir), "/", paste(datasetIDs, collapse="_"),"_PCA_3d_datasets.html"), selfcontained = TRUE)
Below we plot all combinations between the first 5 principal components (PCs), which may help in identifying which PCs explain the biological and/or batch effects.
##### Plot all combinations between the first 5 PCs
##### Preapre colours for plot exis
axis = list(showline=FALSE, zeroline=FALSE, gridcolor='#ffff', ticklen=5)
p <- as.data.frame(y.norm.cpm.subset_pca$x[,1:5]) %>%
plot_ly(width = 800, height = 800) %>%
add_trace( type = 'splom', dimensions = list(
list(label='PC1', values=~PC1), list(label='PC2', values=~PC2), list(label='PC3', values=~PC3), list(label='PC4', values=~PC4), list(label='PC5', values=~PC5)
),
text=paste(targetFile$Dataset, rownames(y.norm.cpm.subset_pca$x), sep=": "),
marker = list( color = datasets.colour[[2]], size = 7, opacity = 0.7
)
) %>%
layout(
title= "",
hovermode="closest",
plot_bgcolor="rgba(240,240,240, 0.95)",
xaxis=list(domain=NULL, showline=F, zeroline=F, gridcolor='#ffff', ticklen=4),
yaxis=list(domain=NULL, showline=F, zeroline=F, gridcolor='#ffff', ticklen=4),
xaxis2=axis, xaxis3=axis, xaxis4=axis, xaxis5=axis, yaxis2=axis, yaxis3=axis, yaxis4=axis, yaxis5=axis
)
p
##### Save pairwise PCA plot as html
htmlwidgets::saveWidget(as_widget(p), paste0(normalizePath(params$projectDir), "/", paste(datasetIDs, collapse="_"),"_PCA_pairwise_datasets.html"), selfcontained = TRUE)
##### Plot all combinations between the first 5 PCs
#pairs(y.norm.cpm.subset_pca$x[,1:5], pch=16, col=datasets.colour[[2]])
##### Save the plot as pdf file
#pdf(paste0(normalizePath(params$projectDir), "/", paste(datasetIDs, collapse="_"),"_PCA_pairwise_datasets.pdf"), width=10, height=10)
#pairs(y.norm.cpm.subset_pca$x[,1:5], pch=16, col=datasets.colour[[2]])
#invisible(dev.off())
The global variability of the data can also be assessed from the scree plot. Here, you can identify the fraction of total variance (y-axis) attributed to each PC (x-axis). The PCs are ordered by decreasing order of contribution to total variance.
##### Generate scree plot
##### Prepare data frame
y.norm.cpm.subset_scree.df <- data.frame(paste0("PC ", c(1:length(y.norm.cpm.subset_pca$sdev))), y.norm.cpm.subset_pca$sdev)
colnames(y.norm.cpm.subset_scree.df) <- c("PC", "Variances")
##### The default order will be alphabetized unless specified as below
y.norm.cpm.subset_scree.df$PC <- factor(y.norm.cpm.subset_scree.df$PC, levels = y.norm.cpm.subset_scree.df[["PC"]])
##### Generate scree plot
p <- plot_ly(y.norm.cpm.subset_scree.df, x = ~PC, y = ~Variances, type = 'bar', width = 600, height = 400) %>%
layout(title = "The variances captured by principal components", xaxis = list(title = ""), margin = list(l=50, r=50, b=100, t=100, pad=4), autosize = F)
p
##### Save scree plot as html
htmlwidgets::saveWidget(as_widget(p), paste0(normalizePath(params$projectDir), "/", paste(datasetIDs, collapse="_"),"_scree_plot.html"), selfcontained = TRUE)
##### Detach plotly package. Otherwise it clashes with other graphics devices
Principal component analysis (PCA) reduces the dimensionality of data while retaining most of the variation in the dataset, making it possible to visually assess similarities and differences between samples and determine whether samples can be grouped. This exploratory analysis facilitates identification of the key factors affecting the variability in the expression data. The colours indicate sample groups (e.g. biological groups), as provided in Target column in the target files.
##### Generate PCA 2-D plot
p <- plot_ly(y.norm.cpm.subset_pca.df, x = ~PC1, y = ~PC2, color = ~Target, text=paste(targetFile$Dataset, rownames(y.norm.cpm.subset_pca.df), sep=": "), colors = targets.colour[[1]], type='scatter', mode = "markers", marker = list(size=10, opacity = 0.7, symbol=datasets.symbols[[2]][order(targetFile$Target)]), width = 600, height = 400) %>%
layout(title = "", xaxis = list(title = paste( "PC1", " (",importance_pca["PC1"],")",sep="")), yaxis = list(title = paste( "PC2", " (",importance_pca["PC2"],")",sep="")), margin = list(l=50, r=50, b=50, t=20, pad=4), autosize = FALSE, showlegend = TRUE, legend = list(orientation = "v", y = 0.9))
p
##### Save PCA plot as html
htmlwidgets::saveWidget(as_widget(p), paste0(normalizePath(params$projectDir), "/", paste(datasetIDs, collapse="_"),"_PCA_2d_targets.html"), selfcontained = TRUE)
##### Generate PCA 3-D plot
p <- plot_ly(y.norm.cpm.subset_pca.df, x = ~PC1, y = ~PC2, z = ~PC3, color = ~Target, text=paste(targetFile$Dataset, rownames(y.norm.cpm.subset_pca.df), sep=": "), colors = targets.colour[[1]], type='scatter3d', mode = "markers", marker = list(size=8, opacity = 0.7, symbol=datasets.symbols[[2]][order(targetFile$Target)]), width = 600, height = 600) %>%
layout(title = "", xaxis = list(title = paste( "PC1", " (",importance_pca["PC1"],")",sep="")), yaxis = list(title = paste( "PC2", " (",importance_pca["PC2"],")",sep="")), zaxis = list(title = paste( "PC3", " (",importance_pca["PC3"],")",sep="")) , margin = list(l=50, r=50, b=50, t=10, pad=4), autosize = FALSE, showlegend = TRUE, legend = list(orientation = "v", y = 0.9))
p
##### Save PCA plot as html
htmlwidgets::saveWidget(as_widget(p), paste0(normalizePath(params$projectDir), "/", paste(datasetIDs, collapse="_"),"_PCA_3d_targets.html"), selfcontained = TRUE)
Below we plot all combinations between the first 5 principal components (PCs), which may help in identifying which PCs explain the biological and/or batch effects.
##### Plot all combinations between the first 5 PCs
p <- as.data.frame(y.norm.cpm.subset_pca$x[,1:5]) %>%
plot_ly(width = 800, height = 800) %>%
add_trace( type = 'splom', dimensions = list(
list(label='PC1', values=~PC1), list(label='PC2', values=~PC2), list(label='PC3', values=~PC3), list(label='PC4', values=~PC4), list(label='PC5', values=~PC5)
),
text=paste(targetFile$Target, rownames(y.norm.cpm.subset_pca$x), sep=": "),
marker = list( color = targets.colour[[2]], size = 7, opacity = 0.7
)
) %>%
layout(
title= "",
hovermode="closest",
plot_bgcolor="rgba(240,240,240, 0.95)",
xaxis=list(domain=NULL, showline=F, zeroline=F, gridcolor='#ffff', ticklen=4),
yaxis=list(domain=NULL, showline=F, zeroline=F, gridcolor='#ffff', ticklen=4),
xaxis2=axis, xaxis3=axis, xaxis4=axis, xaxis5=axis, yaxis2=axis, yaxis3=axis, yaxis4=axis, yaxis5=axis, legend = list(orientation = "v", y = 0.9)
)
p
##### Save pairwise PCA plot as html
htmlwidgets::saveWidget(as_widget(p), paste0(normalizePath(params$projectDir), "/", paste(datasetIDs, collapse="_"),"_PCA_pairwise_targets.html"), selfcontained = TRUE)
##### Plot all combinations between the first 5 PCs
#pairs(y.norm.cpm.subset_pca$x[,1:5], pch=16, col=targets.colour[[2]])
##### Save the plot as pdf file
#pdf(paste0(normalizePath(params$projectDir), "/", paste(datasetIDs, collapse="_"),"_PCA_pairwise_targets.pdf"), width=10, height=10)
#pairs(y.norm.cpm.subset_pca$x[,1:5], pch=16, col=targets.colour[[2]])
#invisible(dev.off())
The global variability of the data can also be assessed from the scree plot. Here, you can identify the fraction of total variance (y-axis) attributed to each PC (x-axis). The PCs are ordered by decreasing order of contribution to total variance.
##### Generate scree plot
p <- plot_ly(y.norm.cpm.subset_scree.df, x = ~PC, y = ~Variances, type = 'bar', width = 600, height = 400) %>%
layout(title = "The variances captured by principal components", xaxis = list(title = ""), margin = list(l=50, r=50, b=100, t=100, pad=4), autosize = F)
p
##### Save scree plot as html
htmlwidgets::saveWidget(as_widget(p), paste0(normalizePath(params$projectDir), "/", paste(datasetIDs, collapse="_"),"_scree_plot.html"), selfcontained = TRUE)
##### Detach plotly package. Otherwise it clashes with other graphics devices
detach("package:plotly", unload=FALSE)
Neet to benchmark batch effects correction/modelling methods, including removeBatchEffect, RUVSeq, MBatch/EB++, DESeq2, sva after realising that Combat sucks (check this and that)
cat("Writing TMM-normalised expression data to", paste0(paste(datasetIDs, collapse="_"),"_TMM.exp"), "\n\n")
Writing TMM-normalised expression data to TCGA-PAAD_TCGA-BRCA_TCGA-HNSC_TMM.exp
write.table(geneMatrix2write(y.norm.cpm), file=paste0(normalizePath(params$projectDir), "/", paste(datasetIDs, collapse="_"),"_TMM.txt"),sep="\t", quote=FALSE, row.names=FALSE, append = FALSE)
for ( i in 1:length(params) ) {
cat(paste("Parameter: ", names(params)[i], "\nValue: ", paste(unlist(params[i]), collapse = ","), "\n\n", sep=""))
}
Parameter: projectDir
Value: /Users/jmarzec/Documents/GitHub/UMCCR/RNAseq-Analysis-Report/data
Parameter: datasetsFile
Value: Datasets_list_breast_head_and_neck_group.txt
##### Write used parameters into a file
write.table(params, file = paste0(normalizePath(params$projectDir), "/", paste(datasetIDs, collapse="_"),".combineExprData.parameters.txt"), sep="\t", quote=FALSE, row.names=FALSE, append = FALSE)
sessionInfo()
R version 3.5.0 (2018-04-23)
Platform: x86_64-apple-darwin15.6.0 (64-bit)
Running under: macOS High Sierra 10.13.6
Matrix products: default
BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
locale:
[1] en_AU.UTF-8/en_AU.UTF-8/en_AU.UTF-8/C/en_AU.UTF-8/en_AU.UTF-8
attached base packages:
[1] stats4 parallel stats graphics grDevices utils datasets
[8] methods base
other attached packages:
[1] bindrcpp_0.2.2 RSkittleBrewer_1.1
[3] corrplot_0.84 ggplot2_3.0.0
[5] FactoMineR_1.41 sneer_0.0.0.9028
[7] RUVSeq_1.14.0 edgeR_3.22.3
[9] limma_3.36.2 EDASeq_2.14.1
[11] ShortRead_1.38.0 GenomicAlignments_1.16.0
[13] SummarizedExperiment_1.10.1 DelayedArray_0.6.5
[15] matrixStats_0.54.0 Rsamtools_1.32.2
[17] GenomicRanges_1.32.6 GenomeInfoDb_1.16.0
[19] Biostrings_2.48.0 XVector_0.20.0
[21] IRanges_2.14.10 S4Vectors_0.18.3
[23] Biobase_2.40.0 BiocGenerics_0.26.0
[25] sva_3.28.0 BiocParallel_1.14.2
[27] genefilter_1.62.0 mgcv_1.8-24
[29] nlme_3.1-137 rapportools_1.0
[31] reshape_0.8.7 preprocessCore_1.42.0
[33] optparse_1.6.0
loaded via a namespace (and not attached):
[1] colorspace_1.3-2 hwriter_1.3.2 rprojroot_1.3-2
[4] getopt_1.20.2 bit64_0.9-7 AnnotationDbi_1.42.1
[7] splines_3.5.0 R.methodsS3_1.7.1 leaps_3.0
[10] DESeq_1.32.0 geneplotter_1.58.0 knitr_1.20
[13] jsonlite_1.5 annotate_1.58.0 cluster_2.0.7-1
[16] R.oo_1.22.0 shiny_1.1.0 compiler_3.5.0
[19] httr_1.3.1 backports_1.1.2 assertthat_0.2.0
[22] Matrix_1.2-14 lazyeval_0.2.1 later_0.7.3
[25] htmltools_0.3.6 prettyunits_1.0.2 tools_3.5.0
[28] gtable_0.2.0 glue_1.3.0 GenomeInfoDbData_1.1.0
[31] dplyr_0.7.6 Rcpp_0.12.18 rtracklayer_1.40.4
[34] crosstalk_1.0.0 stringr_1.3.1 mime_0.5
[37] XML_3.98-1.16 zlibbioc_1.26.0 MASS_7.3-50
[40] scales_1.0.0 aroma.light_3.10.0 promises_1.0.1
[43] hms_0.4.2 RColorBrewer_1.1-2 yaml_2.2.0
[46] memoise_1.1.0 pander_0.6.2 biomaRt_2.36.1
[49] latticeExtra_0.6-28 stringi_1.2.4 RSQLite_2.1.1
[52] GenomicFeatures_1.32.2 rlang_0.2.2 pkgconfig_2.0.2
[55] bitops_1.0-6 evaluate_0.11 lattice_0.20-35
[58] purrr_0.2.5 bindr_0.1.1 htmlwidgets_1.2
[61] bit_1.1-14 tidyselect_0.2.4 plyr_1.8.4
[64] magrittr_1.5 R6_2.2.2 DBI_1.0.0
[67] pillar_1.3.0 withr_2.1.2 survival_2.42-6
[70] scatterplot3d_0.3-41 RCurl_1.95-4.11 tibble_1.4.2
[73] crayon_1.3.4 plotly_4.8.0 rmarkdown_1.10
[76] progress_1.2.0 locfit_1.5-9.1 grid_3.5.0
[79] data.table_1.11.4 blob_1.1.1 digest_0.6.15
[82] flashClust_1.01-2 xtable_1.8-2 httpuv_1.4.5
[85] tidyr_0.8.1 R.utils_2.6.0 munsell_0.5.0
[88] viridisLite_0.3.0